import numpy as np
import random
from self_factory import Data2Index

def writetxt(filename, line):
    """Append ``line`` to the UTF-8 text file ``filename``.

    The file is opened in append mode, so it is created when missing and
    existing content is kept. Nothing is added to ``line`` (no newline).
    """
    with open(file=filename, mode='a', encoding='utf-8') as sink:
        sink.write(line)

def get_datas(path='data/3L_data.txt'):
    """Parse the raw corpus file into ``[article, comment]`` pairs.

    Each input line is normalised as follows:

    1. Literal ``@``, ``#`` and ``$`` characters are removed so they can be
       reused as sentinels.
    2. ``id:`` becomes ``@``, ``<内容`` (content) becomes ``#`` and
       ``<评论`` (comment) becomes ``$``; the trailing newline is dropped.
    3. Digits, lowercase letters and the markup/punctuation characters in
       ``'</>="-":._▲</p></div>'`` are filtered out.

    A line is kept only if the filtered text starts with ``@``. It is then
    split on ``$``; the last ``$``-segment must contain exactly one ``#``,
    and the text after that ``#`` is taken as the article. Every earlier
    ``$``-segment longer than 5 characters is emitted as a comment for that
    article. Spaces are removed from both article and comment.

    Args:
        path: Path of the UTF-8 input file. Defaults to the original
            hard-coded location ``'data/3L_data.txt'``.

    Returns:
        A list of two-element lists ``[article, comment]``.
    """
    # Characters removed from every line (markup residue and punctuation).
    dropped = set('</>="-":._▲</p></div>')
    datas = []

    with open(path, 'r', encoding='utf8') as f:
        for fline in f:
            # Sentinels: '@' marks the id, '#' the content, '$' a comment.
            # Strip any literal sentinel characters first; the original code
            # stripped '@' twice and never '$', so a raw '$' in the text was
            # mistaken for a comment boundary.
            flined = (
                fline.replace('@', '').replace('#', '').replace('$', '')
                .replace('id:', '@').replace('<内容', '#').replace('<评论', '$')
                .replace('\n', '')
            )

            # Filter out digits, lowercase letters and markup characters.
            rline = ''.join(
                ch for ch in flined
                if not ch.isdigit() and ch not in dropped and not ch.islower()
            )

            # Skip lines that are empty or do not begin with the id marker.
            if not rline.startswith('@'):
                continue

            # Split the line into '$'-separated segments, then split each
            # segment on '#' to detect the one carrying the article content.
            segments = [comment.split('#') for comment in rline.split('$')]
            if len(segments[-1]) != 2:
                continue

            article = segments[-1][1].replace(' ', '')
            for parts in segments[:-1]:
                # Very short segments (including the leading id part) are
                # not used as comments.
                if len(parts[0]) > 5:
                    datas.append([article, parts[0].replace(' ', '')])

    return datas

def write2file(max_pairs=339000):
    """Write shuffled article/comment pairs to the sentence and label files.

    Loads all pairs via ``get_datas()``, visits them in random order and,
    for each pair, picks one random '。'-delimited sentence of the article
    as the input line and the comment as the label line. Both are passed
    through ``Data2Index.str2list`` (presumably a tokenizer returning a list
    of strings; confirm against ``self_factory``), joined with spaces,
    wrapped as ``'BOS ... PAD\\n'`` and appended to ``data/sentence.txt``
    and ``data/label.txt`` respectively.

    Pairs that raise during processing are printed and skipped so one bad
    record does not abort the whole run.

    Args:
        max_pairs: Stop once this many pairs have been written. ``None``
            disables the limit. Defaults to the original cap of 339000.
    """
    num = 0
    data = get_datas()
    data_index = list(range(len(data)))
    random.shuffle(data_index)

    print('begin write to txt file')

    for i in data_index:
        if max_pairs is not None and num >= max_pairs:
            break
        lines = data[i]

        try:
            # str.split always returns at least one element, so choice()
            # never sees an empty sequence here.
            lineA_rand = random.choice(lines[0].split('。'))

            lineA = lineA_rand.replace(' ', '').strip(',')
            lineB = lines[1].replace(' ', '').strip(',')

            # Build both lines before writing so a tokenizer failure cannot
            # leave the two output files out of step.
            lineA_ = 'BOS ' + ' '.join(Data2Index.str2list(lineA)) + ' PAD\n'
            lineB_ = 'BOS ' + ' '.join(Data2Index.str2list(lineB)) + ' PAD\n'

            writetxt('data/sentence.txt', lineA_)
            writetxt('data/label.txt', lineB_)
        except Exception as err:
            # Best-effort: report the offending record and carry on. Catching
            # Exception (not a bare except) keeps Ctrl-C / SystemExit working.
            print(lines)
            print('skipped:', repr(err))
            continue

        num += 1
        if num % 10000 == 0:
            print(num)

    print('final num:', num)

def cut_with_full_stop(linedata):
    """Unfinished placeholder; currently has no effect.

    The only statement checks whether ``linedata`` is a list and discards
    the answer, so the function always returns ``None``.
    """
    _is_list = isinstance(linedata, list)  # result intentionally unused for now
    return None



# Script entry point: this call is at module level, so the whole
# preprocessing pipeline runs whenever the file is executed or imported.
write2file()
